{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Related keywords detection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from aips import get_engine\n",
    "from aips.spark import create_view_from_collection\n",
    "\n",
    "spark = SparkSession.builder.appName(\"AIPS\").getOrCreate()\n",
    "engine = get_engine(\"solr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1: Prepare the data using py-spark and data frames \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_keyword_user_pairs():\n",
    "    pairs_dataframe = spark.sql(\"\"\"SELECT * FROM user_searches\n",
    "                                   ORDER BY user ASC, keyword ASC\"\"\")\n",
    "    print(\"Number of keyword user pairs:\", pairs_dataframe.count())\n",
    "    print(\"\\nKeyword user pairs derived from signals:\")\n",
    "    for pair in pairs_dataframe.head(3):\n",
    "        print(f'User \"{pair[\"user\"]}\" searched for \"{pair[\"keyword\"]}\"')        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of keyword user pairs: 730459\n",
      "\n",
      "Keyword user pairs derived from signals:\n",
      "User \"u10\" searched for \"joy stick\"\n",
      "User \"u10\" searched for \"xbox\"\n",
      "User \"u10\" searched for \"xbox360\"\n"
     ]
    }
   ],
   "source": [
    "signals_collection = engine.get_collection(\"signals\")\n",
    "create_view_from_collection(signals_collection, \"signals\")\n",
    "query = \"\"\"SELECT LOWER(searches.target) AS keyword, searches.user\n",
    "           FROM signals AS searches\n",
    "           WHERE searches.type='query'\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"user_searches\")\n",
    "print_keyword_user_pairs()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step2 : Create Cooccurrence & PMI2  Model based on users searchs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_keyword_cooccurrences():\n",
    "    keyword_searches = spark.sql(\"\"\"SELECT * FROM keywords_users_oc\n",
    "                                    ORDER BY users_occ DESC\"\"\")\n",
    "    keyword_searches.show(10)\n",
    "    keyword_pair_searches = spark.sql(\"\"\"SELECT * FROM keywords_users_cooc\n",
    "                                         ORDER BY users_cooc DESC, keyword1 ASC\"\"\")\n",
    "    print(\"Number of co-occurring keyword searches:\", keyword_pair_searches.count(), \"\\n\")\n",
    "    keyword_pair_searches.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+---------+\n",
      "|    keyword|users_occ|\n",
      "+-----------+---------+\n",
      "|     lcd tv|     8449|\n",
      "|       ipad|     7749|\n",
      "|hp touchpad|     7144|\n",
      "|  iphone 4s|     4642|\n",
      "|   touchpad|     4019|\n",
      "|     laptop|     3625|\n",
      "|    laptops|     3435|\n",
      "|      beats|     3282|\n",
      "|       ipod|     3164|\n",
      "| ipod touch|     2992|\n",
      "+-----------+---------+\n",
      "only showing top 10 rows\n",
      "\n",
      "Number of co-occurring keyword searches: 244876 \n",
      "\n",
      "+-------------+---------------+----------+\n",
      "|     keyword1|       keyword2|users_cooc|\n",
      "+-------------+---------------+----------+\n",
      "|green lantern|captain america|        23|\n",
      "|    iphone 4s|         iphone|        21|\n",
      "|       laptop|      hp laptop|        20|\n",
      "|         thor|captain america|        18|\n",
      "|         bose|          beats|        17|\n",
      "|    iphone 4s|       iphone 4|        17|\n",
      "|   skullcandy|          beats|        17|\n",
      "|      laptops|         laptop|        16|\n",
      "|      macbook|            mac|        16|\n",
      "|         thor|  green lantern|        16|\n",
      "+-------------+---------------+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"SELECT k1.keyword AS keyword1, k2.keyword AS keyword2,\n",
    "           COUNT(DISTINCT k1.user) users_cooc\n",
    "           FROM user_searches k1\n",
    "           JOIN user_searches k2 ON k1.user = k2.user\n",
    "           WHERE k1.keyword > k2.keyword\n",
    "           GROUP BY k1.keyword, k2.keyword\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"keywords_users_cooc\")\n",
    "\n",
    "query = \"\"\"SELECT keyword, COUNT(DISTINCT user) users_occ FROM\n",
    "           user_searches GROUP BY keyword\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"keywords_users_oc\")\n",
    "print_keyword_cooccurrences()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"\"\"\n",
    "SELECT k1.keyword AS k1, k2.keyword AS k2, k1_k2.users_cooc,\n",
    "k1.users_occ AS n_users1, k2.users_occ AS n_users2,\n",
    "LOG(POW(k1_k2.users_cooc, 2) /\n",
    "    (k1.users_occ * k2.users_occ)) AS pmi2\n",
    "FROM keywords_users_cooc AS k1_k2 \n",
    "JOIN keywords_users_oc AS k1 ON k1_k2.keyword1 = k1.keyword\n",
    "JOIN keywords_users_oc AS k2 ON k1_k2.keyword2 = k2.keyword\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"user_related_keywords_pmi\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+--------------------+----------+--------+--------+------+\n",
      "|               k1|                  k2|users_cooc|n_users1|n_users2|  pmi2|\n",
      "+-----------------+--------------------+----------+--------+--------+------+\n",
      "|  iphone 4s cases|      iphone 4 cases|        10|     158|     740|-7.064|\n",
      "|     sony laptops|          hp laptops|         8|     209|     432|-7.252|\n",
      "|otterbox iphone 4|            otterbox|         7|     122|     787| -7.58|\n",
      "|    green lantern|     captain america|        23|     963|    1091|-7.594|\n",
      "|          kenwood|              alpine|        13|     584|     717|-7.815|\n",
      "|      sony laptop|         dell laptop|        10|     620|     451|-7.936|\n",
      "|   wireless mouse|           godfather|         6|     407|     248|-7.939|\n",
      "|       hp laptops|        dell laptops|         6|     432|     269| -8.08|\n",
      "|      mp3 players|        dvd recorder|         6|     334|     365|-8.128|\n",
      "|          quicken|portable dvd players|         6|     281|     434|-8.128|\n",
      "+-----------------+--------------------+----------+--------+--------+------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Unstable\n",
    "spark.sql(\"\"\"SELECT k1, k2, users_cooc, n_users1,\n",
    "                    n_users2, ROUND(pmi2, 3) AS pmi2\n",
    "             FROM user_related_keywords_pmi\n",
    "             WHERE users_cooc > 5 ORDER BY pmi2 DESC, k1 ASC\"\"\").show(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"\"\"\n",
    "SELECT *, (r1 + r2 / (r1 * r2)) / 2 AS comp_score\n",
    "FROM (\n",
    "  SELECT *, \n",
    "  RANK() OVER (PARTITION BY 1\n",
    "               ORDER BY users_cooc DESC) r1,\n",
    "  RANK() OVER (PARTITION BY 1\n",
    "               ORDER BY pmi2 DESC) r2  \n",
    "  FROM user_related_keywords_pmi)\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"users_related_keywords_comp_score\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+---------------+----------+-------+---+------+----------+\n",
      "|           k1|             k2|users_cooc|   pmi2| r1|    r2|comp_score|\n",
      "+-------------+---------------+----------+-------+---+------+----------+\n",
      "|green lantern|captain america|        23| -7.594|  1|  8626|       1.0|\n",
      "|    iphone 4s|         iphone|        21|-10.217|  2| 56156|      1.25|\n",
      "|       laptop|      hp laptop|        20| -9.133|  3| 20383|     1.667|\n",
      "|         thor|captain america|        18| -8.483|  4| 13190|     2.125|\n",
      "|    iphone 4s|       iphone 4|        17|-10.076|  5| 51964|       2.6|\n",
      "|         bose|          beats|        17|-10.074|  5| 51916|       2.6|\n",
      "|   skullcandy|          beats|        17| -9.001|  5| 18792|       2.6|\n",
      "|      laptops|         laptop|        16|-10.792|  8| 80240|     4.063|\n",
      "|      macbook|            mac|        16| -9.891|  8| 45464|     4.063|\n",
      "|         thor|  green lantern|        16| -8.594|  8| 14074|     4.063|\n",
      "|   headphones|   beats by dre|        15| -9.989| 11| 49046|     5.545|\n",
      "|  macbook pro|        macbook|        15| -9.737| 11| 39448|     5.545|\n",
      "|  macbook air|        macbook|        15| -9.443| 11| 26943|     5.545|\n",
      "|   ipod touch|           ipad|        13|-11.829| 14|200871|     7.036|\n",
      "|       ipad 2|           ipad|        13|-11.765| 14|196829|     7.036|\n",
      "|         nook|         kindle|        13| -9.662| 14| 36232|     7.036|\n",
      "|  macbook pro|    macbook air|        13| -9.207| 14| 21301|     7.036|\n",
      "|      kenwood|         alpine|        13| -7.815| 14|  9502|     7.036|\n",
      "| beats by dre|          beats|        12|-10.814| 19| 82811|     9.526|\n",
      "|      macbook|          apple|        12|-10.466| 19| 62087|     9.526|\n",
      "+-------------+---------------+----------+-------+---+------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark.sql(\"\"\"SELECT k1, k2, users_cooc, ROUND(pmi2, 3) as pmi2,\n",
    "             r1, r2, ROUND(comp_score, 3) as comp_score \n",
    "             FROM users_related_keywords_comp_score\n",
    "             ORDER BY comp_score ASC, pmi2 ASC\"\"\").show(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Create Cooccurrence & PMI2  Model based on product interaction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_signals_format():\n",
    "    print(\"Original signals format: \")\n",
    "    spark.sql(\"\"\"SELECT id, query_id, signal_time, target, type, user\n",
    "                 FROM signals WHERE type = 'query'\n",
    "                 ORDER BY id ASC\"\"\").show(3)\n",
    "    print(\"Simplified signals format: \")\n",
    "    spark.sql(\"\"\"SELECT * FROM keyword_click_product\n",
    "                 ORDER BY user ASC, product ASC\"\"\").show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original signals format: \n",
      "+--------------------+-----------+--------------------+-----------+-----+-------+\n",
      "|                  id|   query_id|         signal_time|     target| type|   user|\n",
      "+--------------------+-----------+--------------------+-----------+-----+-------+\n",
      "|00006813-afc5-4ee...|u676746_0_1|2019-11-26 06:11:...|PS3 headset|query|u676746|\n",
      "|00007696-32c4-4c0...|u565788_2_7|2020-05-07 08:47:...|  star trek|query|u565788|\n",
      "|00008143-884e-4d7...|u199301_0_1|2020-05-07 00:28:...|       bose|query|u199301|\n",
      "+--------------------+-----------+--------------------+-----------+-----+-------+\n",
      "only showing top 3 rows\n",
      "\n",
      "Simplified signals format: \n",
      "+-------------+----+------------+\n",
      "|      keyword|user|     product|\n",
      "+-------------+----+------------+\n",
      "|    joy stick| u10|097855018120|\n",
      "|         xbox| u10|885370235876|\n",
      "|virgin mobile|u100|799366521679|\n",
      "+-------------+----+------------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"SELECT LOWER(searches.target) AS keyword, searches.user AS user,\n",
    "           clicks.target AS product FROM signals AS searches\n",
    "           RIGHT JOIN signals AS clicks\n",
    "           ON searches.query_id = clicks.query_id \n",
    "           WHERE searches.type = 'query'\n",
    "           AND clicks.type = 'click'\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"keyword_click_product\")\n",
    "print_signals_format()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_keyword_pair_data():\n",
    "    dataframe = spark.sql(\"\"\"SELECT * FROM keyword_click_product_cooc\n",
    "                             ORDER BY n_products DESC, users_cooc DESC\"\"\")\n",
    "    print(\"Number of co-occurring queries:\", dataframe.count(), \"\\n\")\n",
    "    dataframe.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of co-occurring queries: 1579710 \n",
      "\n",
      "+--------------+-------------+--------+--------+----------+----------+\n",
      "|            k1|           k2|n_users1|n_users2|users_cooc|n_products|\n",
      "+--------------+-------------+--------+--------+----------+----------+\n",
      "|       laptops|       laptop|    3251|    3345|      6596|       187|\n",
      "|       tablets|       tablet|    1510|    1629|      3139|       155|\n",
      "|        tablet|         ipad|    1468|    7067|      8535|       146|\n",
      "|       tablets|         ipad|    1359|    7048|      8407|       132|\n",
      "|       cameras|       camera|     637|     688|      1325|       116|\n",
      "|          ipad|        apple|    6706|    1129|      7835|       111|\n",
      "|      iphone 4|       iphone|    1313|    1754|      3067|       108|\n",
      "|    headphones|  head phones|    1829|     492|      2321|       106|\n",
      "|        ipad 2|         ipad|    2736|    6738|      9474|        98|\n",
      "|     computers|     computer|     536|     392|       928|        98|\n",
      "|iphone 4 cases|iphone 4 case|     648|     810|      1458|        95|\n",
      "|       netbook|       laptop|    1017|    2887|      3904|        94|\n",
      "|        laptop|    computers|    2794|     349|      3143|        94|\n",
      "|       netbook|      laptops|    1018|    2781|      3799|        91|\n",
      "|    headphones|    headphone|    1617|     367|      1984|        90|\n",
      "|        laptop|           hp|    2078|     749|      2827|        89|\n",
      "|        tablet|    computers|    1124|     449|      1573|        89|\n",
      "|       laptops|    computers|    2734|     331|      3065|        88|\n",
      "|           mac|        apple|    1668|    1218|      2886|        88|\n",
      "|     tablet pc|       tablet|     296|    1408|      1704|        87|\n",
      "+--------------+-------------+--------+--------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"\n",
    "SELECT k1.keyword AS k1, k2.keyword AS k2, SUM(p1) n_users1, sum(p2) n_users2,\n",
    "SUM(p1 + p2) AS users_cooc, COUNT(1) n_products FROM (\n",
    "  SELECT keyword, product, COUNT(1) AS p1 FROM keyword_click_product\n",
    "  GROUP BY keyword, product) AS k1 JOIN (\n",
    "  SELECT keyword, product, COUNT(1) AS p2 FROM keyword_click_product\n",
    "  GROUP BY keyword, product) AS k2 ON k1.product = k2.product\n",
    "WHERE k1.keyword > k2.keyword GROUP BY k1.keyword, k2.keyword\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"keyword_click_product_cooc\")\n",
    "print_keyword_pair_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.11"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_keyword_popularity():\n",
    "    dataframe = spark.sql(\"\"\"SELECT * FROM keyword_click_product_oc\n",
    "                             ORDER BY n_users DESC\"\"\")\n",
    "    print(\"Keyword searches that resulted in clicks:\", dataframe.count(), \"\\n\")\n",
    "    dataframe.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Keyword searches that resulted in clicks: 13744 \n",
      "\n",
      "+------------+-------+\n",
      "|     keyword|n_users|\n",
      "+------------+-------+\n",
      "|   star wars|   7564|\n",
      "|        ipad|   7554|\n",
      "| hp touchpad|   4829|\n",
      "|      lcd tv|   4606|\n",
      "|   iphone 4s|   4585|\n",
      "|      laptop|   3554|\n",
      "|       beats|   3498|\n",
      "|     laptops|   3369|\n",
      "|        ipod|   2949|\n",
      "|  ipod touch|   2931|\n",
      "|      ipad 2|   2842|\n",
      "|      kindle|   2833|\n",
      "|    touchpad|   2785|\n",
      "|      iphone|   2430|\n",
      "|beats by dre|   2328|\n",
      "|     macbook|   2313|\n",
      "|  headphones|   2270|\n",
      "|        bose|   2071|\n",
      "|         ps3|   2041|\n",
      "|         mac|   1851|\n",
      "+------------+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"SELECT keyword, COUNT(1) AS n_users FROM keyword_click_product\n",
    "           GROUP BY keyword\"\"\"\n",
    "spark.sql(query).createOrReplaceTempView(\"keyword_click_product_oc\")\n",
    "print_keyword_popularity()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate PMI2, per Listing 6.6\n",
    "spark.sql(\"\"\"\n",
    "SELECT k1.keyword AS k1, k2.keyword AS k2, k1_k2.users_cooc,\n",
    "k1.n_users AS n_users1, k2.n_users AS n_users2,\n",
    "LOG(POW(k1_k2.users_cooc, 2) /\n",
    "   (k1.n_users * k2.n_users)) AS pmi2\n",
    "FROM keyword_click_product_cooc AS k1_k2 \n",
    "JOIN keyword_click_product_oc AS k1 ON k1_k2.k1 = k1.keyword\n",
    "JOIN keyword_click_product_oc AS k2 ON k1_k2.k2 = k2.keyword\n",
    "\"\"\").createOrReplaceTempView(\"product_related_keywords_pmi\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate comp_score, per Listing 6.7\n",
    "spark.sql(\"\"\"\n",
    "SELECT *, (r1 + r2 / (r1 * r2)) / 2 as comp_score from (\n",
    "  SELECT *, \n",
    "    RANK() OVER (PARTITION BY 1 ORDER BY users_cooc DESC) r1, \n",
    "    RANK() OVER (PARTITION BY 1 ORDER BY pmi2 DESC) r2  \n",
    "FROM product_related_keywords_pmi)\n",
    "\"\"\").createOrReplaceTempView(\"product_related_keywords_comp_score\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of co-occurring queries: 1579710 \n",
      "\n",
      "+----------+-----------+--------+--------+-----+----------+\n",
      "|        k1|         k2|n_users1|n_users2| pmi2|comp_score|\n",
      "+----------+-----------+--------+--------+-----+----------+\n",
      "|      ipad|hp touchpad|    7554|    4829|1.232|       1.0|\n",
      "|    ipad 2|       ipad|    2842|    7554|1.431|      1.25|\n",
      "|    tablet|       ipad|    1818|    7554|1.669|     1.667|\n",
      "|  touchpad|       ipad|    2785|    7554|1.223|     2.125|\n",
      "|   tablets|       ipad|    1627|    7554|1.749|       2.6|\n",
      "|     ipad2|       ipad|    1254|    7554|1.903|     3.083|\n",
      "|      ipad|      apple|    7554|    1814|  1.5|     3.571|\n",
      "|  touchpad|hp touchpad|    2785|    4829|1.394|     4.063|\n",
      "|      ipad|  hp tablet|    7554|    1421|1.594|     4.556|\n",
      "|ipod touch|       ipad|    2931|    7554|0.863|      5.05|\n",
      "|      ipad|      i pad|    7554|     612|2.415|     5.545|\n",
      "|    kindle|       ipad|    2833|    7554|0.828|     6.042|\n",
      "|    laptop|       ipad|    3554|    7554|0.593|     6.538|\n",
      "|      ipad| apple ipad|    7554|     326|2.916|     7.036|\n",
      "|    ipad 2|hp touchpad|    2842|    4829|1.181|     7.533|\n",
      "|   laptops|     laptop|    3369|    3554| 1.29|     8.031|\n",
      "|      ipad|         hp|    7554|    1125|1.534|     8.529|\n",
      "|     ipads|       ipad|     254|    7554|3.015|     9.028|\n",
      "|      ipad|  htc flyer|    7554|    1834|1.016|     9.526|\n",
      "|      ipad|    i pad 2|    7554|     204| 3.18|    10.025|\n",
      "+----------+-----------+--------+--------+-----+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Unstable, sometimes need to re-execute several times\n",
    "query = \"\"\"SELECT k1, k2, n_users1, n_users2, ROUND(pmi2, 3) AS pmi2,\n",
    "           ROUND(comp_score, 3) AS comp_score\n",
    "           FROM product_related_keywords_comp_score\n",
    "           ORDER BY comp_score ASC\"\"\"\n",
    "dataframe = spark.sql(query)\n",
    "print(\"Number of co-occurring queries:\", dataframe.count(), \"\\n\")\n",
    "dataframe.show(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Up next: [Misspelling detection and correction](../ch06/3.spell-correction.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
